#! /usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.selector import Selector
from scrapy.http import Request
from ntes.items import NtesItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from datetime import *
import sys
# Python 2 only: force the process-wide default string encoding to UTF-8 so
# implicit str/unicode conversions of the scraped Chinese text do not raise
# UnicodeDecodeError. ``reload`` is needed because ``site`` removes
# ``sys.setdefaultencoding`` at interpreter start-up.
# On Python 3 ``reload`` is not a builtin and the default is already UTF-8,
# so the hack is skipped there instead of failing with NameError on import.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')


class Spider(CrawlSpider):
    """Crawl edu.163.com and turn each matching article page into an ``NtesItem``.

    Links are selected by ``allow_rule``; every page reached through the rule
    is handed to :meth:`parse_news`. The ``get_*`` helpers each read one piece
    of the page and store it on the item in place (they return ``None``).
    A field whose markup is missing is either left unset or set to ``None``;
    a missing node never aborts the whole item.
    """

    name = "ntes"
    allowed_domains = ["edu.163.com"]
    start_urls = ['http://edu.163.com/']
    # [two-digit year, zero-padded month, zero-padded day], evaluated once
    # when the class body runs (i.e. at import time, not per request).
    date_now = date.today().strftime('%y-%m-%d').split('-')
    # Regex for article paths shaped like /<yy>/<mm><digits>/<digits>/...,
    # which restricts the crawl to URLs carrying the current year and month.
    allow_rule = r"/%y/%m\d+/\d+/*".replace('%y', date_now[0]).replace('%m', date_now[1])
    rules = (
        Rule(
            LinkExtractor(allow=allow_rule),
            callback="parse_news",
            # follow=True: keep extracting links from the pages reached
            # through this rule, so the crawl continues past the start page.
            follow=True
        ),
    )

    def parse_news(self, response):
        """Build and return an ``NtesItem`` for one article response.

        ``info_id`` is the last URL path segment up to its first dot; the
        remaining fields are filled by the ``get_*`` helpers.
        """
        item = NtesItem()
        item['info_id'] = response.url.strip().split('/')[-1].split('.')[0]

        extractors = (
            self.get_title,
            self.get_keywords,
            self.get_description,
            self.get_author,
            self.get_time,
            self.get_url,
            self.get_news_from,
            self.get_from_url,
            self.get_text,
            self.get_region,
            self.get_comment_count,
            self.get_section,
        )
        for extract in extractors:
            extract(response, item)

        return item

    def get_title(self, response, item):
        """Store the page ``<title>`` text, minus its last 5 characters, as ``title``."""
        title = response.xpath("/html/head/title/text()").extract()
        if title:
            # The trailing 5 characters are dropped; presumably a fixed site
            # suffix appended to every title -- confirm against live pages.
            item['title'] = title[0][:-5]

    def get_keywords(self, response, item):
        """Store the ``keywords`` meta content as ``tag`` when present."""
        keywords = response.xpath("//meta[@name='keywords']/@content").extract()
        if keywords:
            item['tag'] = keywords[0]

    def get_description(self, response, item):
        """Store the ``description`` meta content as ``news_description`` when present."""
        description = response.xpath("//meta[@name='description']/@content").extract()
        if description:
            item['news_description'] = description[0]

    def get_author(self, response, item):
        """Store the ``author`` meta content as ``author`` when present."""
        author = response.xpath("//meta[@name='author']/@content").extract()
        if author:
            item['author'] = author[0]

    def get_time(self, response, item):
        """Store the publish time as an integer of concatenated digits in ``online_time``.

        The first text node of the ``post_time_source`` div is stripped, its
        last 4 characters are removed, and ``-``, spaces and ``:`` are deleted
        before conversion with ``int``. If the remaining text is not a valid
        integer the field is left unset and a warning is logged.
        """
        news_time = response.xpath("//div[@class='post_time_source']/text()").extract()
        if not news_time:
            return
        # The last 4 characters are cut off; presumably the source label that
        # follows the timestamp -- confirm against live pages.
        raw_time = news_time[0].strip()[0:-4]
        digits = raw_time.replace('-', '').replace(' ', '').replace(':', '')
        try:
            item['online_time'] = int(digits)
        except ValueError:
            # A blank or unexpected text node must not discard the whole item.
            self.logger.warning(
                "Could not parse publish time %r on %s", news_time[0], response.url
            )

    def get_news_from(self, response, item):
        """Store the text of the first link in ``post_time_source`` as ``news_from``."""
        news_from = response.xpath("//div[@class='post_time_source']/a/text()").extract()
        if news_from:
            item['news_from'] = news_from[0]

    def get_from_url(self, response, item):
        """Store the first ``post_time_source`` link href as ``source``.

        ``source`` is set to ``None`` when the link is missing or its href
        does not contain ``http``.
        """
        from_url = response.xpath("//div[@class='post_time_source']/a/@href").extract()
        # Guard against pages without a source link before indexing.
        if from_url and 'http' in from_url[0]:
            item['source'] = from_url[0]
        else:
            item['source'] = None

    def get_text(self, response, item):
        """Store the list of paragraph text nodes under ``#endText`` as ``news_body``."""
        news_body = response.xpath("//div[@id='endText']/p/text()").extract()
        if news_body:
            item['news_body'] = news_body

    def get_url(self, response, item):
        """Store the response URL as ``link``."""
        news_url = response.url
        if news_url:
            item['link'] = news_url

    def get_section(self, response, item):
        """Store the second ``post_crumb`` link text as ``section`` when present."""
        section = response.xpath("//div[@class='post_crumb']/a/text()").extract()
        # The second breadcrumb entry is used, so at least two are required.
        if len(section) > 1:
            item['section'] = section[1]

    def get_comment_count(self, response, item):
        """Store the first ``post_tie_top`` link text as ``comment_count`` when present."""
        comment_count = response.xpath("//div[@class='post_tie_top']/a/text()").extract()
        if comment_count:
            item['comment_count'] = comment_count[0]

    def get_region(self, response, item):
        """Store the parenthesised text after the article header as ``region``.

        Reads the second text node of the first div under ``#epContentLeft``.
        When it contains ``(``, spaces and parentheses are removed and the
        result is stored; otherwise (or when the node is missing) ``region``
        is set to ``None``.
        """
        region = response.xpath('//*[@id="epContentLeft"]/div[1]/text()[2]').extract()
        # Guard against pages without that text node before indexing.
        if region and '(' in region[0]:
            item['region'] = region[0].replace(' ', '').replace('(', '').replace(')', '')
        else:
            item['region'] = None
